Data Preparation
## Format data
claims = claims_df %>%
pivot_longer(-id, names_to = "time", values_to = "agg_claim") %>%
transmute(id = as.character(id),
time = excel_numeric_to_date(as.numeric(time),
date_system = "modern" ),
agg_claim)
er = er_df %>%
pivot_longer(-id, names_to = "time", values_to = "count") %>%
transmute(id = as.character(id),
time = excel_numeric_to_date(as.numeric(time),
date_system = "modern" ),
er_count = count)
pcp = pcp_df %>%
pivot_longer(-id, names_to = "time", values_to = "count") %>%
transmute(id = as.character(id),
time = excel_numeric_to_date(as.numeric(time),
date_system = "modern" ),
pcp_count = count)
hospital = hospital_df %>%
pivot_longer(-id, names_to = "time", values_to = "count") %>%
transmute(id = as.character(id),
time = excel_numeric_to_date(as.numeric(time),
date_system = "modern" ),
hospital_count = count)
rx = rx_df %>%
pivot_longer(-id, names_to = "time", values_to = "count") %>%
transmute(id = as.character(id),
time = excel_numeric_to_date(as.numeric(time),
date_system = "modern" ),
rx_count = count)
## Check counts
er %>% group_by(er_count) %>%
count() %>% arrange()
## # A tibble: 4 x 2
## # Groups: er_count [4]
## er_count n
## <dbl> <int>
## 1 0 340056
## 2 1 12684
## 3 2 250
## 4 3 2
pcp %>% group_by(pcp_count) %>%
count() %>% arrange()
## # A tibble: 5 x 2
## # Groups: pcp_count [5]
## pcp_count n
## <dbl> <int>
## 1 0 317878
## 2 1 33273
## 3 2 1767
## 4 3 72
## 5 4 2
hospital %>% group_by(hospital_count) %>%
count() %>% arrange()
## # A tibble: 3 x 2
## # Groups: hospital_count [3]
## hospital_count n
## <dbl> <int>
## 1 0 345720
## 2 1 7210
## 3 2 62
rx %>% group_by(rx_count) %>%
count() %>% arrange()
## # A tibble: 6 x 2
## # Groups: rx_count [6]
## rx_count n
## <dbl> <int>
## 1 0 286532
## 2 1 59431
## 3 2 6515
## 4 3 485
## 5 4 28
## 6 5 1
claims %>% group_by(agg_claim) %>%
count() %>% arrange() # assume mistyped a minus sign in front
## # A tibble: 148,856 x 2
## # Groups: agg_claim [148,856]
## agg_claim n
## <dbl> <int>
## 1 -8.05e+ 2 1
## 2 -6.64e+ 2 1
## 3 -3.21e+ 2 1
## 4 -1.00e-13 1
## 5 0. 138787
## 6 3.13e+ 1 1
## 7 3.30e+ 1 1
## 8 3.48e+ 1 1
## 9 3.52e+ 1 1
## 10 3.57e+ 1 1
## # … with 148,846 more rows
## Check: each individual either get one or none CRD
demo_df %>% mutate(test_sum = transport+fin_assis+lonely+food_insec) %>%
filter(test_sum > 1)
## # A tibble: 0 x 13
## # … with 13 variables: id <chr>, transport <dbl>, fin_assis <dbl>,
## # lonely <dbl>, food_insec <dbl>, region <chr>, rural <chr>,
## # is_lowincome <chr>, age <dbl>, chronic_count <dbl>, gender <chr>,
## # mth_out <chr>, test_sum <dbl>
## Check: region
demo_df %>% group_by(region) %>%
count() %>% arrange(n) # unknown small, ignore
## # A tibble: 6 x 2
## # Groups: region [6]
## region n
## <chr> <int>
## 1 Unknown 7
## 2 Northwest 777
## 3 Southwest 857
## 4 Northeast 1480
## 5 Central 1925
## 6 Southeast 2308
## Check: rural
demo_df %>% group_by(rural) %>%
count() %>% arrange(n) # unknown small, ignore
## # A tibble: 5 x 2
## # Groups: rural [5]
## rural n
## <chr> <int>
## 1 Unknown 7
## 2 Rural 605
## 3 Semi-Rural 1414
## 4 Suburban 2283
## 5 Urban 3045
## Check: is_lowincome
demo_df %>% group_by(is_lowincome) %>%
count() %>% arrange(n) # inconsistant label, need to fix
## # A tibble: 5 x 2
## # Groups: is_lowincome [5]
## is_lowincome n
## <chr> <int>
## 1 YES 1
## 2 No 3
## 3 Yes 3
## 4 Y 3048
## 5 N 4299
## Check: age
demo_df %>% group_by(age) %>% count() %>% filter(age<5) %>% arrange(age) # need to correct negative values, assume mistyped minues sign
## # A tibble: 14 x 2
## # Groups: age [14]
## age n
## <dbl> <int>
## 1 -25 1
## 2 -21 1
## 3 -18 1
## 4 -17 1
## 5 -14 1
## 6 -12 1
## 7 -11 1
## 8 -8 1
## 9 -7 1
## 10 -5 1
## 11 -2 2
## 12 -1 1
## 13 2 1
## 14 4 1
demo_df %>% group_by(age) %>% count() %>% filter(age>95) %>% arrange(age) # need to correct >100 values, assume mistyped a 1 in the front
## # A tibble: 10 x 2
## # Groups: age [10]
## age n
## <dbl> <int>
## 1 98 2
## 2 100 1
## 3 101 1
## 4 104 1
## 5 151 1
## 6 160 1
## 7 165 1
## 8 167 1
## 9 171 2
## 10 175 1
## Check: chronic_count
demo_df %>% group_by(chronic_count) %>% count() %>% arrange(chronic_count) # assume mistyped a 1 in the front
## # A tibble: 10 x 2
## # Groups: chronic_count [10]
## chronic_count n
## <dbl> <int>
## 1 0 1299
## 2 1 1503
## 3 2 1812
## 4 3 1477
## 5 4 844
## 6 5 336
## 7 6 71
## 8 7 9
## 9 10 1
## 10 11 2
## Check: gender
demo_df %>% group_by(gender) %>% count() %>% arrange(n) # will just ignore other than M, F
## # A tibble: 5 x 2
## # Groups: gender [5]
## gender n
## <chr> <int>
## 1 W 1
## 2 U 2
## 3 X 2
## 4 M 3553
## 5 F 3796
demo = demo_df %>%
mutate(is_lowincome = demo_df$is_lowincome %in% c("Y","Yes","YES"),
age = ifelse(age>110, age-100, abs(age)),
mth_out = decimal_date(mdy(paste(mth_out,"1","2017"))),
chronic_count = ifelse(chronic_count>=10,
chronic_count-10,
chronic_count)) %>%
filter(region != "Unknown" & rural != "Unknown" & gender %in% c("M","F"))
## Add a column indicate which CRD an individual received
crd = NA
for(i in 1:length(demo$id)){
crd[i]=0
if(demo$transport[i] == 1){
crd[i] = 1
}
if(demo$fin_assis[i] == 1){
crd[i] = 2
}
if(demo$lonely[i] == 1){
crd[i] = 3
}
if(demo$food_insec[i] == 1){
crd[i] = 4
}
}
demo = demo %>% cbind(crd)
## Check: all NAs represent people from the control group
demo %>% filter(is.na(mth_out),
crd !=0)
## [1] id transport fin_assis lonely food_insec
## [6] region rural is_lowincome age chronic_count
## [11] gender mth_out crd
## <0 rows> (or 0-length row.names)
## Join utilization and claim tables
loss = er %>%
inner_join(hospital, by = c("id"="id","time"="time")) %>%
inner_join(pcp, by = c("id"="id","time"="time")) %>%
inner_join(rx, by = c("id"="id","time"="time")) %>%
mutate(total_count = er_count+hospital_count+pcp_count+rx_count) %>%
inner_join(claims, by = c("id"="id","time"="time")) %>%
mutate(agg_claim = abs(agg_claim),
avg_claim = ifelse(agg_claim*total_count!=0,agg_claim/total_count,0))
## Convert date to decimal year
loss = loss %>% mutate(time = decimal_date(time))
## Reduction
demo = demo %>%
select(id,region,rural,is_lowincome,age,chronic_count,gender,mth_out,crd)
remove(claims,claims_df,demo_df,er,er_df,hospital,hospital_df,pcp,pcp_df,rx,rx_df,crd,i)
## Join demo(features) with loss(responses)
impact = demo %>% inner_join(loss, by = "id")
## Add a column to indicate if row is after_crd
impact = impact %>%
mutate(after_crd = ifelse(!is.na(mth_out),time >= mth_out,NA))
#impact = impact %>% select(-mth_out)
Misc.
tmp = control
log_claim = log(tmp$agg_claim)
#tmp %>% ggplot() + geom_density(aes(x=log_claim))
#tmp %>% ggplot() + geom_bar(aes(x=total_count))
tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(er_count)),alpha=0.1)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(hospital_count)),alpha=0.1)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(rx_count)),alpha=0.3)

tmp %>% ggplot() + geom_point(aes(x=time,y=log_claim,color=factor(pcp_count)),alpha=0.3)

ggplot(data = tmp) +
geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) +
facet_grid(rows = vars(er_count), cols = vars(hospital_count))

tmp = control
log_claim = log(tmp$agg_claim)
#tmp %>% ggplot() + geom_density(aes(x=log_claim))
#tmp %>% ggplot() + geom_bar(aes(x=total_count))
tmp %>% filter(er_count>0) %>%
ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(er_count)),alpha=0.3) + ylim(4,10)
tmp %>% filter(hospital_count>0) %>%
ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(hospital_count)),alpha=0.3)+ ylim(4,10)
tmp %>% filter(rx_count>0) %>%
ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(rx_count)),alpha=0.3)+ ylim(4,10)
tmp %>% filter(pcp_count>0) %>%
ggplot() + geom_point(aes(x=time,y=log(agg_claim),color=factor(pcp_count)),alpha=0.3)+ ylim(4,10)
ggplot(data = tmp) +
geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) +
facet_grid(rows = vars(er_count), cols = vars(rx_count))
ggplot(data = tmp) +
geom_point(mapping = aes(x = time, y = log_claim),alpha=0.1) +
facet_grid(rows = vars(rx_count), cols = vars(er_count))
tmp = control
er_claim=0
hospital_claim=0
pcp_claim=0
rx_claim=0
er_claim = ifelse((er_count>0)&(hospital_count*pcp_count*rx_count==0),
agg_claim/er_count,
avg_claim)
hospital_claim = ifelse((hospital_count>0)&(er_count*pcp_count*rx_count==0),
agg_claim/hospital_count,
avg_claim)
pcp_claim = ifelse((pcp_count>0)&(er_count*hospital_count*rx_count==0),
agg_claim/pcp_count,
avg_claim)
rx_claim = ifelse((rx_count>0)&(hospital_count*pcp_count*pcp_count==0),
agg_claim/rx_count,
avg_claim)
tmp = tmp %>% cbind(er_claim,hospital_claim,pcp_claim,rx_claim)
log_claim = log(tmp$agg_claim)
#sort(((tmp$er_claim)),decreasing =T)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(er_claim),color=factor(er_count)),alpha=0.1)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(hospital_claim),color=factor(hospital_count)),alpha=0.1)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(rx_claim),color=factor(rx_count)),alpha=0.3)+ ylim(4,10)
tmp %>% ggplot() + geom_point(aes(x=time,y=log(pcp_claim),color=factor(pcp_count)),alpha=0.3)+ ylim(4,10)
attach(control)
tmp = control
model1 = lm(log(agg_claim+1)~er_count*hospital_count*rx_count*pcp_count+
time+gender+age*factor(chronic_count))
summary(model1)
tmp = tmp %>% cbind(fit = fitted(model1),res = residuals(model1))
ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=region),alpha=0.5)
ggplot(data=tmp) + geom_boxplot(mapping=aes(x=factor(chronic_count),y=res))
ggplot(data=tmp) + geom_boxplot(mapping=aes(x=factor(age),y=res))
ggplot(data=tmp,mapping=aes(x=age,y=res)) + geom_point(alpha=0.1)
ggplot(data=tmp,mapping=aes(x=factor(chronic_count),y=res)) + geom_jitter(alpha=0.1)
ggplot(data = tmp) +
geom_point(mapping = aes(x = fit, y = res),alpha=0.1) +
facet_grid(rows = vars(cut(age,c(0,20,40,60,80,100,120))), cols = vars(factor(chronic_count)))
ggplot(data=tmp) + geom_jitter(mapping=aes(x=age,y=factor(chronic_count)),alpha=0.1)
ggplot(data=tmp) + geom_density(aes(x=age))
ggplot(data=tmp) + geom_bar(aes(x=factor(chronic_count)))
attach(impact)
tmp = impact
sin_term = sin(2*pi*impact$time)
cos_term = cos(2*pi*impact$time)
time_sq <- impact$time^2 # will use later
time_cu <- impact$time^3 # will use later
## Frequency model
model_total_count = glm(total_count ~ sin_term + cos_term + time +
region + rural + is_lowincome + age +
chronic_count + factor(crd),
family = poisson)
model1 = model_total_count
summary(model1)
tmp = tmp %>% cbind(fit = fitted(model1),res = residuals(model1))
#ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=factor(crd)))
#ggplot(data=tmp) + geom_point(mapping=aes(x=fit,y=res,color=factor(region)))
impact %>%
group_by(after_crd,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim)) %>%
ggplot(aes(x=time,y=monthly_total_count,color=after_crd))+
geom_point()+
geom_smooth(method="lm")
impact %>%
group_by(after_crd,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim)) %>%
ggplot(aes(x=time,y=monthly_agg_claim,color=after_crd))+
geom_point()+
geom_smooth(method="lm")
impact %>%
group_by(after_crd,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim)) %>%
ggplot(aes(x=time,y=monthly_avg_claim,color=after_crd))+
geom_point()+
geom_smooth(method="lm")
a = impact %>%
filter(after_crd==F) %>%
group_by(mth_out,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim))
b = impact %>%
filter(after_crd==T) %>%
group_by(mth_out,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim))
c = impact %>%
filter(is.na(after_crd)) %>%
group_by(mth_out,time) %>%
summarize(monthly_total_count = mean(total_count),
monthly_agg_claim = mean(agg_claim),
monthly_avg_claim = mean(avg_claim))
ggplot(mapping = aes(x=time,y=monthly_total_count,color=factor(mth_out)))+
#geom_point(data=a)+
geom_smooth(data=a,method="lm",se=F)+
#geom_point(data=b)+
geom_smooth(data=b,method="lm",se=F)+
geom_smooth(data=c,se=T)
not_control = impact %>% filter(!is.na(mth_out))
control = impact %>% filter(is.na(mth_out))
signif_level = 0.05
test_type = "greater"
### Raw comparison
tmp = not_control %>% group_by(crd,after_crd,id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()
is_effective=tmp$`TRUE`<tmp$`FALSE`
tmp = tmp %>% cbind(eff=is_effective)
x = tmp %>% group_by(crd) %>%
summarize(n_eff=sum(eff),
n_not_eff=sum(!eff),
n=n(),
eff_prop=mean(eff))
## Test for Normality
tmp = not_control %>% group_by(crd, after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>% ungroup()
diff = tmp$`FALSE` - tmp$`TRUE`
shapiro.test(diff) #small p-value, not Normal, note sample size big
ks.test(diff,"pnorm") #small p-value, not Normal, note sample size big
ggqqplot(diff) #looking at qqplot and density, pretty Normal
ggplot()+geom_density(aes(x=diff))
## Test for significance in difference
#paired t test - assume Normal
not_control %>% group_by(crd,
after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>%
ungroup() %>% group_by(crd) %>%
do(tidy(t.test(.$`FALSE`, .$`TRUE`, data=.,
alternative = test_type, paired = TRUE))) %>%
mutate(is_eff = p.value<signif_level) %>%
select(crd,
is_eff) %>% data.frame(x)
#wilcoxon signed-rank test - not Normal
not_control %>% group_by(crd,
after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>%
ungroup() %>% group_by(crd) %>%
do(tidy(wilcox.test(.$`FALSE`, .$`TRUE`, data=.,
alternative = test_type, paired = TRUE))) %>%
mutate(is_eff = p.value<signif_level) %>%
select(crd,
is_eff) %>% data.frame(x)
tmp = not_control %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd,
after_crd,id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()
is_effective=tmp$`TRUE`<tmp$`FALSE`
tmp = tmp %>% cbind(eff=is_effective)
x = tmp %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd) %>%
summarize(n_eff=sum(eff),
n=sum(eff+!eff),
eff_prop=mean(eff))
signif_level = 0.05
test_type = "greater"
### Raw comparison
tmp = not_control %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd,
after_crd,id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()
is_effective=tmp$`TRUE`<tmp$`FALSE`
tmp = tmp %>% cbind(eff=is_effective)
x = tmp %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd) %>%
summarize(n_eff=sum(eff),
n_not_eff=sum(!eff),
n=n(),
eff_prop=mean(eff))
## Test for Normality
tmp = not_control %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd,
after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>% ungroup()
diff = tmp$`FALSE` - tmp$`TRUE`
shapiro.test(diff) #small p-value, not Normal, note sample size big
ks.test(diff,"pnorm") #small p-value, not Normal, note sample size big
ggqqplot(diff) #looking at qqplot and density, pretty Normal
ggplot()+geom_density(aes(x=diff))
## Test for significance in difference
#paired t test - assume Normal
not_control %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd,
after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>%
ungroup() %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd) %>%
do(tidy(t.test(.$`FALSE`, .$`TRUE`, data=.,
alternative = test_type, paired = TRUE))) %>%
mutate(is_eff = p.value<signif_level) %>%
select(age,gender,chronic_count,
is_lowincome,rural,region,crd,
is_eff) %>% data.frame(x)
#wilcoxon signed-rank test - not Normal
x = not_control %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd,
after_crd, id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key = after_crd, value = avg_mthly_claim) %>%
ungroup() %>% group_by(age,gender,chronic_count,
is_lowincome,rural,region,crd) %>%
do(tidy(wilcox.test(.$`FALSE`, .$`TRUE`, data=.,
alternative = test_type, paired = TRUE))) %>%
mutate(is_eff = p.value<signif_level) %>%
select(age,gender,chronic_count,
is_lowincome,rural,region,crd,
is_eff) %>% data.frame(x) %>%
select(age,gender,chronic_count,
is_lowincome,rural,region,crd,
n_eff,n_not_eff,n,eff_prop,
is_eff)
t=list()
tmp=list()
for(i in 1:4){
tmp[[i]] = not_control %>% filter(crd == i)
df = tmp[[i]] %>% group_by(after_crd,id) %>%
summarize(avg_mthly_claim = mean(agg_claim)) %>%
spread(key=after_crd,value=avg_mthly_claim) %>% ungroup()
is_effective=ifelse(df$`TRUE`<df$`FALSE`,"Y","N")
df = df %>% cbind(is_eff=is_effective)
df = df %>% inner_join(demo,by="id") %>%
select(-c("FALSE","TRUE","id","mth_out","crd")) %>%
mutate(is_lowincome=as.factor(is_lowincome),
region=as.factor(region),
rural=as.factor(rural),
gender=as.factor(gender)
)
t[[i]] = C5.0(is_eff~.-is_eff, data=df)
plot(t[[i]])
print(summary(t[[i]]))
}

##
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Mar 2 03:24:25 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 1430 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## is_lowincome = FALSE:
## :...region in {Northeast,Southwest}: N (324/130)
## : region = Northwest:
## : :...chronic_count <= 1: N (39/6)
## : : chronic_count > 1:
## : : :...rural in {Rural,Semi-Rural,Urban}: N (46/14)
## : : rural = Suburban: Y (22/8)
## : region = Central:
## : :...rural in {Rural,Semi-Rural}: N (68/32)
## : : rural = Suburban: Y (77/30)
## : : rural = Urban:
## : : :...chronic_count > 4: N (6)
## : : chronic_count <= 4:
## : : :...gender = F: Y (64/28)
## : : gender = M: N (25/7)
## : region = Southeast:
## : :...gender = M: Y (83/39)
## : gender = F:
## : :...rural in {Suburban,Urban}: N (181/72)
## : rural = Rural:
## : :...age <= 73: N (11/3)
## : : age > 73: Y (2)
## : rural = Semi-Rural:
## : :...age <= 73: N (37/12)
## : age > 73: Y (11/2)
## is_lowincome = TRUE:
## :...chronic_count > 4: N (31/9)
## chronic_count <= 4:
## :...rural = Semi-Rural: Y (62/24)
## rural = Rural:
## :...region in {Northeast,Southwest}: N (7/2)
## : region = Northwest: Y (5/1)
## : region = Central:
## : :...age <= 64: N (7/1)
## : : age > 64: Y (5/1)
## : region = Southeast:
## : :...chronic_count <= 2: N (7/2)
## : chronic_count > 2: Y (6/1)
## rural = Urban:
## :...chronic_count <= 0:
## : :...age <= 77: Y (28/11)
## : : age > 77: N (4/1)
## : chronic_count > 0:
## : :...age <= 66: N (71/25)
## : age > 66: Y (65/29)
## rural = Suburban:
## :...region in {Central,Southwest}: Y (39/16)
## region = Northwest:
## :...chronic_count <= 1: N (3)
## : chronic_count > 1: Y (10/2)
## region = Northeast:
## :...age <= 50: N (3)
## : age > 50:
## : :...chronic_count <= 3: Y (17/4)
## : chronic_count > 3: N (4/1)
## region = Southeast:
## :...chronic_count <= 1: N (26/10)
## chronic_count > 1:
## :...chronic_count <= 3: Y (26/7)
## chronic_count > 3: N (8/2)
##
##
## Evaluation on training data (1430 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 36 532(37.2%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 579 203 (a): class N
## 329 319 (b): class Y
##
##
## Attribute usage:
##
## 100.00% is_lowincome
## 81.75% region
## 66.64% rural
## 44.48% chronic_count
## 28.95% gender
## 18.53% age
##
##
## Time: 0.0 secs

##
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Mar 2 03:24:26 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 1509 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## is_lowincome = TRUE:
## :...rural = Suburban: N (283/120)
## : rural = Rural:
## : :...gender = F: Y (16/5)
## : : gender = M:
## : : :...chronic_count <= 0: Y (4)
## : : chronic_count > 0: N (51/19)
## : rural = Semi-Rural:
## : :...region in {Northeast,Southeast}: Y (101/46)
## : : region in {Northwest,Southwest}: N (26/9)
## : : region = Central:
## : : :...chronic_count <= 0: Y (10/2)
## : : chronic_count > 0: N (41/18)
## : rural = Urban:
## : :...region in {Central,Northwest}: Y (143/62)
## : region = Northeast: N (58/25)
## : region = Southeast:
## : :...chronic_count <= 4: Y (82/40)
## : : chronic_count > 4: N (9)
## : region = Southwest:
## : :...gender = F: N (6/2)
## : gender = M: Y (33/14)
## is_lowincome = FALSE:
## :...chronic_count <= 1: N (256/96)
## chronic_count > 1:
## :...rural = Rural: N (38/14)
## rural = Suburban:
## :...chronic_count <= 2: Y (52/16)
## : chronic_count > 2:
## : :...age <= 64: Y (31/14)
## : age > 64: N (33/11)
## rural = Semi-Rural:
## :...chronic_count <= 2: N (31/9)
## : chronic_count > 2:
## : :...chronic_count <= 3: Y (21/10)
## : chronic_count > 3:
## : :...age <= 66: Y (9/1)
## : age > 66: N (14/5)
## rural = Urban:
## :...region in {Northeast,Southwest}: N (55/20)
## region = Northwest: Y (11/5)
## region = Southeast:
## :...age <= 78: Y (47/22)
## : age > 78: N (5)
## region = Central:
## :...age > 71: Y (11/1)
## age <= 71:
## :...chronic_count <= 4: N (27/11)
## chronic_count > 4: Y (5/1)
##
##
## Evaluation on training data (1509 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 30 598(39.6%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 574 239 (a): class N
## 359 337 (b): class Y
##
##
## Attribute usage:
##
## 100.00% is_lowincome
## 83.04% rural
## 55.86% chronic_count
## 44.40% region
## 12.06% age
## 7.29% gender
##
##
## Time: 0.0 secs

##
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Mar 2 03:24:27 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 777 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## chronic_count <= 2:
## :...rural = Rural:
## : :...age <= 63: N (13/1)
## : : age > 63: Y (28/11)
## : rural = Suburban:
## : :...is_lowincome = FALSE:
## : : :...region in {Northwest,Southeast}: N (47/18)
## : : : region = Central:
## : : : :...age <= 65: N (15/5)
## : : : : age > 65: Y (11/1)
## : : : region = Northeast:
## : : : :...age <= 56: Y (5/1)
## : : : : age > 56: N (13/5)
## : : : region = Southwest:
## : : : :...chronic_count <= 0: N (2)
## : : : chronic_count > 0:
## : : : :...chronic_count <= 1: Y (4/1)
## : : : chronic_count > 1: N (4/1)
## : : is_lowincome = TRUE:
## : : :...region in {Central,Northwest}: N (14/5)
## : : region = Northeast:
## : : :...chronic_count <= 0: Y (4)
## : : : chronic_count > 0: N (10/4)
## : : region = Southwest:
## : : :...chronic_count <= 0: Y (2)
## : : : chronic_count > 0: N (4/1)
## : : region = Southeast:
## : : :...chronic_count <= 1: Y (10/1)
## : : chronic_count > 1:
## : : :...age <= 66: Y (4)
## : : age > 66: N (3)
## : rural = Semi-Rural:
## : :...region = Central: N (18/5)
## : : region = Southwest: Y (15/3)
## : : region = Northeast:
## : : :...chronic_count <= 0: Y (3)
## : : : chronic_count > 0: N (16/5)
## : : region = Northwest:
## : : :...is_lowincome = TRUE: N (4/1)
## : : : is_lowincome = FALSE:
## : : : :...chronic_count <= 0: N (3/1)
## : : : chronic_count > 0: Y (6/1)
## : : region = Southeast:
## : : :...chronic_count <= 0: Y (5)
## : : chronic_count > 0:
## : : :...chronic_count <= 1: Y (8/2)
## : : chronic_count > 1:
## : : :...age > 77: Y (3)
## : : age <= 77:
## : : :...age <= 60: Y (7/2)
## : : age > 60: N (7)
## : rural = Urban:
## : :...region = Central:
## : :...gender = F: Y (28/10)
## : : gender = M: N (25/9)
## : region = Northwest:
## : :...gender = F: N (10/3)
## : : gender = M:
## : : :...is_lowincome = FALSE: Y (2)
## : : is_lowincome = TRUE: N (3/1)
## : region = Southwest:
## : :...gender = M: Y (11/3)
## : : gender = F:
## : : :...chronic_count <= 0: Y (4/1)
## : : chronic_count > 0: N (12/3)
## : region = Northeast:
## : :...is_lowincome = FALSE:
## : : :...gender = F:
## : : : :...chronic_count <= 1: N (12/4)
## : : : : chronic_count > 1: Y (8/3)
## : : : gender = M:
## : : : :...age <= 60: N (2)
## : : : age > 60: Y (2)
## : : is_lowincome = TRUE:
## : : :...chronic_count <= 0: N (3)
## : : chronic_count > 0:
## : : :...age <= 65: Y (5/1)
## : : age > 65: N (5/1)
## : region = Southeast:
## : :...is_lowincome = TRUE:
## : :...age <= 66: Y (8)
## : : age > 66:
## : : :...age <= 82: N (13/3)
## : : age > 82: Y (2)
## : is_lowincome = FALSE:
## : :...chronic_count <= 0:
## : :...gender = M: N (8/3)
## : : gender = F:
## : : :...age <= 61: Y (4)
## : : age > 61: N (4/1)
## : chronic_count > 0:
## : :...gender = F: N (23/7)
## : gender = M:
## : :...age <= 71: Y (5/1)
## : age > 71: N (3)
## chronic_count > 2:
## :...region = Northwest: N (23/6)
## region = Northeast:
## :...rural = Semi-Rural: Y (17/6)
## : rural = Urban: N (26/5)
## : rural = Rural:
## : :...chronic_count > 4: N (2)
## : : chronic_count <= 4:
## : : :...age <= 72: N (2)
## : : age > 72: Y (2)
## : rural = Suburban:
## : :...age > 73: N (4)
## : age <= 73:
## : :...gender = F: N (3/1)
## : gender = M: Y (4/1)
## region = Southwest:
## :...chronic_count > 4: N (8/1)
## : chronic_count <= 4:
## : :...rural = Rural: N (3/1)
## : rural = Semi-Rural:
## : :...age <= 64: Y (2)
## : : age > 64: N (3/1)
## : rural = Suburban:
## : :...is_lowincome = FALSE: N (3/1)
## : : is_lowincome = TRUE: Y (6/1)
## : rural = Urban:
## : :...is_lowincome = FALSE: Y (8/2)
## : is_lowincome = TRUE: N (6)
## region = Central:
## :...rural = Rural:
## : :...chronic_count <= 3: Y (4/1)
## : : chronic_count > 3: N (8/2)
## : rural = Suburban:
## : :...age <= 55: N (4)
## : : age > 55: Y (15/6)
## : rural = Semi-Rural:
## : :...age <= 66: N (9)
## : : age > 66:
## : : :...chronic_count > 3: Y (2)
## : : chronic_count <= 3:
## : : :...age <= 75: N (2)
## : : age > 75: Y (2)
## : rural = Urban:
## : :...chronic_count <= 3: N (14/6)
## : chronic_count > 3:
## : :...chronic_count > 4: Y (9/4)
## : chronic_count <= 4:
## : :...age <= 60: N (5)
## : age > 60: Y (6/1)
## region = Southeast:
## :...is_lowincome = FALSE: N (57/21)
## is_lowincome = TRUE:
## :...age > 78: N (3)
## age <= 78:
## :...rural = Rural: Y (0)
## rural = Semi-Rural:
## :...chronic_count <= 3: N (2)
## : chronic_count > 3: Y (4/1)
## rural = Urban:
## :...age <= 51: N (2)
## : age > 51: Y (8/1)
## rural = Suburban:
## :...chronic_count <= 3: Y (4/1)
## chronic_count > 3:
## :...age <= 69: N (2)
## age > 69: Y (3/1)
##
##
## Evaluation on training data (777 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 92 200(25.7%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 355 68 (a): class N
## 132 222 (b): class Y
##
##
## Attribute usage:
##
## 100.00% chronic_count
## 94.72% region
## 88.29% rural
## 49.55% is_lowincome
## 32.82% age
## 22.27% gender
##
##
## Time: 0.0 secs

##
## Call:
## C5.0.formula(formula = is_eff ~ . - is_eff, data = df)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Mar 2 03:24:34 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 1207 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## is_lowincome = FALSE:
## :...chronic_count <= 1: N (191/71)
## : chronic_count > 1:
## : :...rural in {Semi-Rural,Urban}: Y (205/92)
## : rural = Rural:
## : :...chronic_count <= 4: N (18/6)
## : : chronic_count > 4: Y (4)
## : rural = Suburban:
## : :...region in {Northwest,Southeast,Southwest}: N (52/19)
## : region = Central:
## : :...age <= 73: N (18/2)
## : : age > 73: Y (5/1)
## : region = Northeast:
## : :...age <= 71: Y (22/5)
## : age > 71: N (3)
## is_lowincome = TRUE:
## :...rural = Semi-Rural: Y (126/51)
## rural = Rural:
## :...gender = F: Y (10/4)
## : gender = M: N (45/17)
## rural = Suburban:
## :...region in {Central,Southwest}: N (83/38)
## : region in {Northeast,Northwest}: Y (75/29)
## : region = Southeast:
## : :...age <= 71: Y (42/17)
## : age > 71:
## : :...chronic_count <= 1: Y (6/2)
## : chronic_count > 1: N (12/1)
## rural = Urban:
## :...region = Central: Y (80/31)
## region = Northwest:
## :...chronic_count <= 3: Y (23/10)
## : chronic_count > 3: N (6/2)
## region = Southeast:
## :...age <= 50: Y (11/1)
## : age > 50: N (79/33)
## region = Southwest:
## :...gender = F: N (4/1)
## : gender = M: Y (25/8)
## region = Northeast:
## :...gender = F: Y (7/1)
## gender = M:
## :...age <= 67: Y (32/10)
## age > 67: N (23/5)
##
##
## Evaluation on training data (1207 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 27 457(37.9%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 339 262 (a): class N
## 195 411 (b): class Y
##
##
## Attribute usage:
##
## 100.00% is_lowincome
## 84.18% rural
## 50.37% region
## 46.81% chronic_count
## 20.96% age
## 12.10% gender
##
##
## Time: 0.0 secs